/*
* Copyright (C) 2003-2009 eXo Platform SAS.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Affero General Public License
* as published by the Free Software Foundation; either version 3
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, see<http://www.gnu.org/licenses/>.
*/
package org.exoplatform.services.jcr.analyzer;
import java.io.IOException;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
* Created by The eXo Platform SARL
* Author : Nguyen Van Chien
* chien.nguyen@exoplatform.com
* Jul 19, 2010
*/
public class UnescapeHTMLFilter extends TokenFilter {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
public UnescapeHTMLFilter(TokenStream input) {
super(input);
}
@Override
public final boolean incrementToken() throws IOException {
if (!input.incrementToken()) {
return false;
}
final char[] buffer = termAtt.buffer();
final int bufferLength = termAtt.length();
String tokenText = new String(buffer);
tokenText = tokenText.replaceAll("<br", "");
tokenText = StringEscapeUtils.unescapeHtml(tokenText);
tokenText = tokenText.replaceAll("\\<.*?>", "");
int newLen = tokenText.toCharArray().length;
if (newLen < bufferLength) {
termAtt.copyBuffer(tokenText.toCharArray(), 0, newLen);
termAtt.setLength(newLen);
}
return true;
}
}